// CuNNy 8x4C BILINEAR RGB NVL - https://github.com/funnyplanter/CuNNy

// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
// 
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// 
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
// 
// You should have received a copy of the GNU General Public License
// along with this program.  If not, see <http://www.gnu.org/licenses/>.

//!MAGPIE EFFECT
//!VERSION 4
//!SORT_NAME CuNNy-D04N08

//!TEXTURE
Texture2D INPUT;

//!TEXTURE
//!WIDTH INPUT_WIDTH * 2
//!HEIGHT INPUT_HEIGHT * 2
Texture2D OUTPUT;

//!SAMPLER
//!FILTER POINT
SamplerState SP;

//!SAMPLER
//!FILTER LINEAR
SamplerState SL;

//!COMMON
#define O(t, p) t.SampleLevel(SP, pos + p * pt, 0)
#define V4 min16float4
#define M4 min16float4x4

//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R8G8B8A8_SNORM
Texture2D t0;

//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R8G8B8A8_SNORM
Texture2D t1;

//!PASS 1
//!DESC in
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN INPUT
//!OUT t0

#define l0(x, y) min16float((dot(float3(2.666e-01, 5.050e-01, 1.135e-01), O(INPUT, float2(x, y)).rgb) + -8.258e-01))

V4 f0(min16float s0_0, min16float s0_1, min16float s0_2, min16float s0_3, min16float s0_4, min16float s0_5, min16float s0_6, min16float s0_7, min16float s0_8) {
	V4 r = 0.0;
	r += V4(-2.544e-02, -4.130e-01, -2.634e-01, 2.417e-02) * s0_0;
	r += V4(1.256e-02, -8.013e-02, 9.539e-02, -7.111e-02) * s0_1;
	r += V4(1.768e-02, -2.469e-01, -1.627e-01, 8.569e-02) * s0_2;
	r += V4(-1.554e-01, 3.441e-02, -1.508e-01, 2.491e-02) * s0_3;
	r += V4(1.628e-01, 8.679e-01, -1.960e-02, -5.810e-01) * s0_4;
	r += V4(-1.237e-02, -1.704e-01, 2.915e-01, -5.922e-01) * s0_5;
	r += V4(7.925e-01, 5.570e-03, 7.074e-02, 4.442e-04) * s0_6;
	r += V4(-7.910e-01, -1.530e-02, -8.229e-02, 3.149e-03) * s0_7;
	r += V4(-3.973e-03, 2.262e-02, -1.213e-01, 3.843e-02) * s0_8;
	r += V4(-8.495e-04, -1.121e-04, 1.842e-02, 5.844e-02);
	return r;
}

void Pass1(uint2 blockStart, uint3 tid) {
	float2 pt = float2(GetInputPt());
	uint2 gxy = Rmp8x8(tid.x) + blockStart;
	uint2 size = GetInputSize();
	if (gxy.x >= size.x || gxy.y >= size.y) {
		return;
	}
	float2 pos = (gxy + 0.5) * pt;

	min16float s0_0 = l0(-1.0, -1.0);
	min16float s0_1 = l0(0.0, -1.0);
	min16float s0_2 = l0(1.0, -1.0);
	min16float s0_3 = l0(-1.0, 0.0);
	min16float s0_4 = l0(0.0, 0.0);
	min16float s0_5 = l0(1.0, 0.0);
	min16float s0_6 = l0(-1.0, 1.0);
	min16float s0_7 = l0(0.0, 1.0);
	min16float s0_8 = l0(1.0, 1.0);

	t0[gxy] = f0(s0_0, s0_1, s0_2, s0_3, s0_4, s0_5, s0_6, s0_7, s0_8);
}

//!PASS 2
//!DESC conv1
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t0
//!OUT t1

#define l0(x, y) V4(O(t0, float2(x, y)))

V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
	V4 r = 0.0;
	r += mul(s0_0, M4(4.254e-02, 1.997e-01, 4.636e-02, -4.800e-02, 2.043e-01, -4.096e-02, -7.212e-02, 1.408e-02, -3.916e-01, 2.630e-03, 7.016e-02, 9.613e-02, 1.773e-01, -2.723e-01, -9.458e-02, -1.890e-01));
	r += mul(s0_1, M4(2.350e-01, -8.474e-01, -4.044e-01, -9.188e-01, 9.560e-03, 5.061e-02, 1.092e-02, 1.781e-01, -2.144e-01, 3.203e-02, 6.349e-02, -8.272e-02, -3.105e-01, -3.917e-02, -1.320e-02, -1.541e-01));
	r += mul(s0_2, M4(-8.130e-01, -1.003e-01, 8.195e-02, -7.597e-01, 5.207e-02, 3.470e-02, -8.823e-03, -1.131e-01, -4.029e-02, 7.571e-02, -2.010e-01, 2.487e-01, 1.677e-01, -5.118e-02, -1.070e-01, 7.606e-02));
	r += mul(s0_3, M4(-1.158e-02, 4.898e-02, 1.202e-02, 5.012e-01, -5.343e-02, 4.756e-02, -2.438e-01, 6.399e-02, 2.822e-01, -2.863e-02, 1.996e-01, -7.099e-02, -1.323e-01, -3.797e-01, 5.385e-02, -1.014e-01));
	r += mul(s0_4, M4(2.812e-01, 7.903e-01, -1.733e-01, 6.668e-01, 4.775e-01, 5.452e-01, 7.089e-01, -1.851e-01, -2.382e-01, -5.180e-02, -3.623e-01, -3.040e-01, -4.313e-01, -1.167e-02, 1.235e-01, 1.436e-01));
	r += mul(s0_5, M4(-1.291e-01, -3.022e-02, -4.083e-01, -5.939e-02, -4.249e-01, -1.750e-01, 1.094e-01, -1.176e-01, 1.374e-02, 1.342e-01, 2.086e-01, 2.841e-01, 2.347e-01, 1.450e-01, 7.604e-02, 2.176e-01));
	r += mul(s0_6, M4(8.130e-02, -7.215e-02, -5.249e-02, 9.518e-03, -1.979e-01, -4.441e-02, -1.857e-01, -4.227e-01, 2.149e-01, -1.610e-01, 1.655e-01, -8.841e-02, 1.409e-01, -1.059e-01, 2.037e-01, -2.744e-03));
	r += mul(s0_7, M4(-7.266e-02, 1.638e-02, -1.639e-01, 1.957e-02, -2.857e-01, 1.936e-01, -1.243e-01, -1.490e-01, 1.525e-01, -8.934e-02, 7.415e-02, -1.779e-01, 1.648e-02, -6.456e-02, 7.053e-02, -9.530e-02));
	r += mul(s0_8, M4(-6.960e-02, -8.960e-02, -1.757e-02, -1.370e-01, -5.137e-01, -1.179e-01, -4.053e-01, -1.987e-01, 7.100e-02, 2.928e-02, -9.682e-02, 2.403e-01, 1.814e-01, 2.131e-02, 5.579e-02, 5.457e-02));
	r += mul(s1_0, M4(-2.737e-02, 5.272e-02, -1.801e-02, -2.491e-01, 2.871e-01, -3.704e-02, -6.568e-02, 2.905e-02, 1.011e-01, -3.782e-01, -8.696e-02, 4.682e-01, 3.233e-01, -3.060e-01, -3.251e-02, 1.165e+00));
	r += mul(s1_1, M4(-4.994e-01, 3.049e-02, -8.802e-02, -6.179e-02, 7.133e-02, -1.957e-02, -4.465e-02, 1.130e-01, 7.255e-02, 6.956e-03, -1.204e-01, 3.699e-01, -8.844e-02, 4.624e-01, -9.881e-02, -2.512e-01));
	r += mul(s1_2, M4(-3.645e-01, 1.274e-01, 2.387e-01, -1.963e-01, -5.995e-02, -5.943e-02, 9.694e-02, -2.518e-01, -2.797e-01, 1.598e-01, -1.371e-02, 4.000e-01, 2.213e-01, 9.692e-02, -3.302e-01, 1.132e+00));
	r += mul(s1_3, M4(-8.539e-03, -6.535e-02, 5.575e-02, 1.928e-01, 1.156e-01, 5.227e-02, -3.039e-01, 4.794e-01, 1.441e-01, 1.929e-01, -4.689e-02, 2.023e-02, 1.330e-01, -1.358e+00, -5.393e-01, 7.907e-01));
	r += mul(s1_4, M4(1.701e-01, -3.479e-02, 5.404e-01, -2.491e-01, 4.564e-01, 6.659e-01, 7.009e-01, -2.288e-02, -7.696e-01, -4.959e-01, 2.881e-01, -4.322e-01, -9.013e-01, -4.765e-01, 5.556e-02, -1.805e-01));
	r += mul(s1_5, M4(-2.424e-01, 8.034e-03, -4.699e-02, -2.628e-01, -4.682e-01, 2.977e-02, 2.258e-01, -1.419e-01, 3.514e-01, 6.860e-03, 2.147e-01, 3.806e-01, 3.747e-01, 1.403e-01, 3.106e-01, 9.680e-01));
	r += mul(s1_6, M4(1.776e-01, -4.873e-02, -1.403e-01, -1.817e-02, -3.551e-01, 4.838e-04, -2.786e-01, -6.048e-01, 3.082e-01, -4.703e-01, 2.419e-01, -3.002e-01, -4.310e-01, -6.490e-01, 1.343e+00, -1.019e+00));
	r += mul(s1_7, M4(4.689e-02, -2.927e-02, -7.494e-02, -3.516e-02, -2.217e-01, -3.189e-01, 2.202e-01, -2.936e-01, 4.772e-02, -1.609e-01, 9.853e-02, -4.214e-01, 2.780e-01, -1.073e-01, 1.102e-01, -2.033e-01));
	r += mul(s1_8, M4(-9.468e-02, 4.428e-02, 1.269e-01, -1.086e-01, -1.106e-01, -1.367e-01, -3.356e-01, 4.656e-03, 4.648e-02, -1.743e-02, -2.074e-01, -3.745e-02, 1.281e-01, -3.233e-01, 6.533e-01, 3.705e-01));
	r += V4(1.016e-03, 5.583e-03, -1.608e-02, -1.996e-04);
	return r;
}

void Pass2(uint2 blockStart, uint3 tid) {
	float2 pt = float2(GetInputPt());
	uint2 gxy = Rmp8x8(tid.x) + blockStart;
	uint2 size = GetInputSize();
	if (gxy.x >= size.x || gxy.y >= size.y) {
		return;
	}
	float2 pos = (gxy + 0.5) * pt;

	V4 s0_0 = l0(-1.0, -1.0);
	V4 s0_1 = l0(0.0, -1.0);
	V4 s0_2 = l0(1.0, -1.0);
	V4 s0_3 = l0(-1.0, 0.0);
	V4 s0_4 = l0(0.0, 0.0);
	V4 s0_5 = l0(1.0, 0.0);
	V4 s0_6 = l0(-1.0, 1.0);
	V4 s0_7 = l0(0.0, 1.0);
	V4 s0_8 = l0(1.0, 1.0);
	V4 s1_0 = -max(-s0_0, 0.0);
	V4 s1_1 = -max(-s0_1, 0.0);
	V4 s1_2 = -max(-s0_2, 0.0);
	V4 s1_3 = -max(-s0_3, 0.0);
	V4 s1_4 = -max(-s0_4, 0.0);
	V4 s1_5 = -max(-s0_5, 0.0);
	V4 s1_6 = -max(-s0_6, 0.0);
	V4 s1_7 = -max(-s0_7, 0.0);
	V4 s1_8 = -max(-s0_8, 0.0);
	s0_0 = max(s0_0, 0.0);
	s0_1 = max(s0_1, 0.0);
	s0_2 = max(s0_2, 0.0);
	s0_3 = max(s0_3, 0.0);
	s0_4 = max(s0_4, 0.0);
	s0_5 = max(s0_5, 0.0);
	s0_6 = max(s0_6, 0.0);
	s0_7 = max(s0_7, 0.0);
	s0_8 = max(s0_8, 0.0);

	t1[gxy] = f0(s0_0, s0_1, s0_2, s0_3, s0_4, s0_5, s0_6, s0_7, s0_8, s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8);
}

//!PASS 3
//!DESC conv2
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t1
//!OUT t0

#define l0(x, y) V4(O(t1, float2(x, y)))

V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
	V4 r = 0.0;
	r += mul(s0_0, M4(-4.810e-02, 2.379e-02, -8.471e-02, 1.305e-01, -5.897e-02, 1.263e-01, -9.639e-02, 9.150e-02, 9.002e-03, -1.763e-01, 8.275e-02, -2.357e-01, 7.181e-02, -7.360e-02, 4.629e-02, -8.259e-02));
	r += mul(s0_1, M4(6.774e-02, 9.108e-02, -3.750e-01, 8.014e-02, 2.890e-01, 9.986e-02, -1.262e-02, -1.285e-01, -2.789e-01, -1.145e-01, -4.982e-02, -1.101e-01, -2.051e-02, -2.271e-01, 1.343e-01, -8.643e-02));
	r += mul(s0_2, M4(-5.433e-02, 6.899e-02, -3.350e-01, -7.837e-02, -1.076e-01, 1.912e-02, -9.061e-02, 1.919e-01, 9.387e-02, -4.206e-02, 1.861e-01, -4.416e-03, -1.560e-01, -4.364e-02, 4.364e-01, 8.765e-02));
	r += mul(s0_3, M4(2.382e-01, 3.032e-01, -1.313e-01, -1.154e-01, 1.008e-01, 3.058e-01, -8.513e-02, 2.713e-01, -9.875e-02, 3.017e-01, 3.203e-02, 5.762e-01, -2.056e-03, -7.698e-02, 8.681e-02, 4.245e-02));
	r += mul(s0_4, M4(2.643e-01, 1.750e-01, 4.850e-02, 3.131e-03, 2.785e-01, 1.598e-01, 5.772e-01, -4.118e-04, -4.270e-01, -2.447e-01, 4.486e-01, 9.155e-02, -3.428e-01, -2.583e-01, -3.721e-02, 6.278e-02));
	r += mul(s0_5, M4(-1.080e-01, -5.514e-02, -3.648e-01, -2.319e-02, -2.100e-01, -4.065e-02, 1.126e-01, 3.970e-02, 9.824e-02, 1.377e-02, 1.295e-01, -2.512e-02, 1.115e-01, 7.094e-02, 3.413e-01, -5.245e-02));
	r += mul(s0_6, M4(1.991e-01, 4.710e-02, -9.305e-02, -1.471e-01, -8.221e-02, 1.134e-01, -1.718e-01, -2.606e-01, -8.167e-02, -1.462e-02, -1.094e-01, -1.569e-01, 2.133e-02, 3.374e-02, 4.583e-02, 1.228e-01));
	r += mul(s0_7, M4(-2.135e-01, 6.874e-02, -4.993e-02, 1.156e-02, -4.261e-01, 1.366e-01, 4.250e-02, -5.707e-02, -1.966e-01, -6.106e-02, 1.265e-01, -3.076e-03, 2.043e-03, -3.072e-02, 1.043e-01, 3.422e-01));
	r += mul(s0_8, M4(7.235e-02, -3.542e-04, -1.435e-02, -3.815e-02, -8.855e-02, 8.327e-02, 1.954e-01, 1.462e-01, 1.615e-01, -4.957e-02, 1.596e-02, -8.625e-02, 6.574e-02, -9.799e-02, 5.401e-03, 7.595e-02));
	r += mul(s1_0, M4(1.245e-01, -2.812e-03, 1.486e-02, 1.246e-01, -5.943e-02, 1.170e-01, -1.068e-01, 8.960e-02, 5.354e-03, -2.039e-01, 8.228e-02, -2.530e-01, -2.789e-03, -6.932e-02, -3.187e-02, -5.794e-02));
	r += mul(s1_1, M4(-2.539e-02, 4.598e-02, -1.205e-01, 1.597e-01, 2.391e-01, 1.269e-01, -1.116e-02, 1.498e-02, -2.388e-01, -1.548e-01, -7.389e-02, -1.083e-02, -1.181e-01, -7.069e-02, 9.383e-03, -2.018e-01));
	r += mul(s1_2, M4(-1.248e-02, 3.267e-02, -2.761e-01, -2.043e-02, -8.520e-02, 3.937e-02, -1.372e-01, 1.821e-02, 6.915e-02, -4.061e-02, 1.782e-01, -4.619e-02, 6.811e-02, -5.458e-04, 3.193e-01, 8.892e-03));
	r += mul(s1_3, M4(-1.580e-01, 7.536e-02, -6.680e-02, 1.891e-01, 1.196e-01, 3.476e-01, -6.321e-02, 1.972e-01, -9.851e-02, 4.483e-01, 9.326e-03, 5.272e-01, -1.478e-01, -4.009e-02, -3.561e-02, -2.549e-01));
	r += mul(s1_4, M4(-1.253e-01, 1.345e-01, 4.994e-01, 2.000e-01, 2.728e-01, 1.672e-01, 5.501e-01, -1.736e-02, -5.782e-01, -2.191e-01, 4.380e-01, 4.346e-02, -3.006e-01, -5.220e-02, -1.613e-01, 6.023e-02));
	r += mul(s1_5, M4(1.276e-01, -8.319e-02, -2.115e-01, 1.471e-01, -1.669e-01, -2.484e-02, 9.906e-02, 1.836e-02, 1.010e-01, 1.847e-02, 1.027e-01, -1.680e-02, -1.880e-01, 1.377e-01, 3.823e-02, -8.256e-02));
	r += mul(s1_6, M4(-3.200e-01, -7.023e-02, -1.243e-01, -2.003e-02, -7.863e-02, 6.650e-02, -1.264e-01, -1.862e-01, -9.119e-02, -4.374e-02, -1.195e-01, -6.902e-02, -1.360e-01, 3.356e-02, -3.667e-02, -1.815e-01));
	r += mul(s1_7, M4(1.462e-02, 1.001e-01, 2.453e-01, -1.298e-02, -4.372e-01, 1.509e-01, 8.011e-02, -1.323e-01, -1.980e-01, -4.785e-02, 1.733e-01, 1.100e-02, -2.153e-01, 6.711e-02, 2.595e-03, 1.213e-01));
	r += mul(s1_8, M4(-3.794e-03, 2.239e-02, -6.960e-02, 7.342e-02, -1.882e-01, 1.159e-01, 1.876e-01, 3.125e-02, 2.242e-01, -5.956e-02, 1.328e-02, -5.400e-02, 2.205e-02, -6.049e-02, -9.151e-02, -1.137e-01));
	r += V4(-1.437e-02, -2.276e-02, 2.275e-02, 6.547e-04);
	return r;
}

void Pass3(uint2 blockStart, uint3 tid) {
	float2 pt = float2(GetInputPt());
	uint2 gxy = Rmp8x8(tid.x) + blockStart;
	uint2 size = GetInputSize();
	if (gxy.x >= size.x || gxy.y >= size.y) {
		return;
	}
	float2 pos = (gxy + 0.5) * pt;

	V4 s0_0 = l0(-1.0, -1.0);
	V4 s0_1 = l0(0.0, -1.0);
	V4 s0_2 = l0(1.0, -1.0);
	V4 s0_3 = l0(-1.0, 0.0);
	V4 s0_4 = l0(0.0, 0.0);
	V4 s0_5 = l0(1.0, 0.0);
	V4 s0_6 = l0(-1.0, 1.0);
	V4 s0_7 = l0(0.0, 1.0);
	V4 s0_8 = l0(1.0, 1.0);
	V4 s1_0 = -max(-s0_0, 0.0);
	V4 s1_1 = -max(-s0_1, 0.0);
	V4 s1_2 = -max(-s0_2, 0.0);
	V4 s1_3 = -max(-s0_3, 0.0);
	V4 s1_4 = -max(-s0_4, 0.0);
	V4 s1_5 = -max(-s0_5, 0.0);
	V4 s1_6 = -max(-s0_6, 0.0);
	V4 s1_7 = -max(-s0_7, 0.0);
	V4 s1_8 = -max(-s0_8, 0.0);
	s0_0 = max(s0_0, 0.0);
	s0_1 = max(s0_1, 0.0);
	s0_2 = max(s0_2, 0.0);
	s0_3 = max(s0_3, 0.0);
	s0_4 = max(s0_4, 0.0);
	s0_5 = max(s0_5, 0.0);
	s0_6 = max(s0_6, 0.0);
	s0_7 = max(s0_7, 0.0);
	s0_8 = max(s0_8, 0.0);

	t0[gxy] = f0(s0_0, s0_1, s0_2, s0_3, s0_4, s0_5, s0_6, s0_7, s0_8, s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8);
}

//!PASS 4
//!DESC conv3
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t0
//!OUT t1

#define l0(x, y) V4(O(t0, float2(x, y)))

V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
	V4 r = 0.0;
	r += mul(s0_0, M4(3.886e-03, -1.503e-01, -6.378e-01, 4.214e-02, -1.255e-01, 1.146e-01, -1.917e-01, -6.556e-02, -3.368e-02, 6.874e-02, 2.796e-01, -2.936e-02, -3.239e-02, 3.923e-02, -6.439e-02, 1.313e-02));
	r += mul(s0_1, M4(4.357e-01, -1.067e-01, 3.330e-01, -8.295e-02, -4.004e-01, 3.113e-01, -4.222e-02, 2.290e-01, -1.861e-01, 9.039e-02, -1.132e-01, 1.077e-01, -1.603e-02, 6.296e-02, 4.907e-01, 3.396e-02));
	r += mul(s0_2, M4(-3.290e-01, -1.073e-01, 1.064e-02, -2.792e-03, -4.366e-01, 3.239e-01, -1.383e-01, 1.918e-01, 3.058e-02, 1.006e-01, -6.898e-02, -1.451e-02, -1.882e-01, 2.248e-01, 1.744e-02, -3.155e-02));
	r += mul(s0_3, M4(2.403e-02, -1.353e-01, 1.895e-01, -2.285e-01, -1.211e-01, 1.771e-01, 2.135e-01, 1.900e-01, -4.204e-03, 3.719e-02, -4.772e-01, 2.006e-01, -2.532e-03, 5.872e-02, 2.901e-01, -9.450e-02));
	r += mul(s0_4, M4(8.054e-02, 1.389e-02, -2.060e-02, -3.042e-01, -2.476e-01, 9.905e-02, -9.248e-01, 3.372e-01, -5.254e-01, 4.455e-01, 5.707e-02, 1.057e-01, -3.525e-01, 3.349e-01, -3.414e-01, 7.090e-02));
	r += mul(s0_5, M4(-1.889e-01, -2.290e-01, -4.930e-02, -1.824e-01, -2.062e+00, 6.868e-02, 2.552e-01, 3.883e-01, 5.778e-02, 9.141e-02, 9.917e-02, -1.164e-01, 4.359e-02, 2.105e-01, -7.911e-02, -1.916e-01));
	r += mul(s0_6, M4(-2.267e-02, -6.231e-03, -9.718e-03, 3.770e-04, -6.982e-02, 4.184e-02, -2.296e-01, -9.542e-02, 5.236e-02, -5.412e-02, -1.757e-01, -1.054e-01, 1.414e-02, -7.772e-02, -1.338e-02, 3.928e-02));
	r += mul(s0_7, M4(5.776e-02, 4.703e-02, 3.914e-02, -1.617e-02, -3.606e-01, 3.037e-01, -3.096e-01, 3.562e-02, 3.108e-01, -3.684e-01, 3.725e-02, -2.050e-01, -1.494e-02, 8.741e-02, 5.992e-02, 2.655e-02));
	r += mul(s0_8, M4(3.614e-02, -1.212e-01, 2.507e-02, -5.858e-02, -1.121e-01, -3.433e-01, 6.613e-02, -6.943e-01, 2.233e-02, -5.467e-02, -6.900e-03, -2.566e-01, -1.106e-01, 2.016e-02, -3.700e-02, -2.886e-01));
	r += mul(s1_0, M4(-5.136e-02, -2.190e-01, -1.035e+00, -5.722e-02, 2.876e-02, 5.070e-02, 3.532e-01, -6.778e-03, 2.930e-04, -6.219e-02, 2.314e-01, -5.210e-02, 1.508e-02, -4.390e-02, -7.749e-02, -9.658e-03));
	r += mul(s1_1, M4(3.663e-01, -9.746e-02, -6.582e-01, -3.676e-01, -1.694e-01, 7.883e-02, -1.613e-01, 2.328e-02, 2.595e-04, -3.763e-02, -9.946e-02, -6.137e-02, 1.429e-01, -1.964e-01, 2.439e-01, 4.898e-02));
	r += mul(s1_2, M4(7.884e-02, 1.842e-01, -1.309e-01, 4.895e-02, 4.820e-02, 8.364e-02, 1.189e-02, -1.438e-02, -7.934e-02, 4.775e-02, -6.137e-02, -1.335e-02, -4.416e-02, 3.584e-02, 1.751e-04, -1.178e-02));
	r += mul(s1_3, M4(-9.861e-03, -1.277e-01, 2.389e-03, -3.232e-01, -2.782e-03, 1.115e-01, -6.485e-02, 2.093e-01, 2.056e-01, 2.527e-02, -1.772e-01, 1.863e-02, 5.983e-02, -8.103e-02, 3.076e-01, -2.027e-01));
	r += mul(s1_4, M4(1.001e-01, 3.476e-01, -1.305e-01, -1.653e-01, 8.890e-02, -4.170e-01, -1.530e-01, 7.048e-02, -5.605e-01, 1.093e-01, 2.038e-01, -2.320e-01, -1.287e-01, -2.173e-01, -1.630e-01, -9.691e-02));
	r += mul(s1_5, M4(-2.778e-01, 1.393e-01, -2.802e-02, -5.375e-02, -4.550e-01, -1.661e-01, 2.293e-03, -5.984e-02, -5.070e-02, -8.852e-02, 7.806e-02, 2.187e-02, 1.901e-01, -3.219e-01, -1.937e-01, -2.336e-01));
	r += mul(s1_6, M4(-8.489e-02, 1.968e-01, -7.760e-02, 1.388e-01, 4.713e-03, 1.527e-01, 8.535e-02, 1.643e-02, 1.429e-01, -1.558e-01, 2.339e-01, 2.762e-01, 1.694e-02, -4.245e-02, -2.793e-02, -3.332e-02));
	r += mul(s1_7, M4(-4.377e-02, 3.486e-01, -1.766e-01, -1.065e-01, -1.645e-01, -8.722e-04, -1.147e-01, 1.663e-01, 6.801e-02, -3.539e-01, 1.560e-02, -1.819e-01, 1.440e-02, -1.221e-02, 3.693e-02, 5.886e-03));
	r += mul(s1_8, M4(5.940e-02, 1.624e-01, 1.526e-02, 6.692e-02, 1.812e-01, -8.647e-02, 3.210e-02, -3.751e-04, 2.884e-02, -4.717e-02, 4.121e-03, 5.144e-02, -1.995e-02, -2.827e-01, 6.148e-03, 7.209e-02));
	r += V4(1.575e-02, -2.007e-01, -3.519e-03, -9.082e-03);
	return r;
}

void Pass4(uint2 blockStart, uint3 tid) {
	float2 pt = float2(GetInputPt());
	uint2 gxy = Rmp8x8(tid.x) + blockStart;
	uint2 size = GetInputSize();
	if (gxy.x >= size.x || gxy.y >= size.y) {
		return;
	}
	float2 pos = (gxy + 0.5) * pt;

	V4 s0_0 = l0(-1.0, -1.0);
	V4 s0_1 = l0(0.0, -1.0);
	V4 s0_2 = l0(1.0, -1.0);
	V4 s0_3 = l0(-1.0, 0.0);
	V4 s0_4 = l0(0.0, 0.0);
	V4 s0_5 = l0(1.0, 0.0);
	V4 s0_6 = l0(-1.0, 1.0);
	V4 s0_7 = l0(0.0, 1.0);
	V4 s0_8 = l0(1.0, 1.0);
	V4 s1_0 = -max(-s0_0, 0.0);
	V4 s1_1 = -max(-s0_1, 0.0);
	V4 s1_2 = -max(-s0_2, 0.0);
	V4 s1_3 = -max(-s0_3, 0.0);
	V4 s1_4 = -max(-s0_4, 0.0);
	V4 s1_5 = -max(-s0_5, 0.0);
	V4 s1_6 = -max(-s0_6, 0.0);
	V4 s1_7 = -max(-s0_7, 0.0);
	V4 s1_8 = -max(-s0_8, 0.0);
	s0_0 = max(s0_0, 0.0);
	s0_1 = max(s0_1, 0.0);
	s0_2 = max(s0_2, 0.0);
	s0_3 = max(s0_3, 0.0);
	s0_4 = max(s0_4, 0.0);
	s0_5 = max(s0_5, 0.0);
	s0_6 = max(s0_6, 0.0);
	s0_7 = max(s0_7, 0.0);
	s0_8 = max(s0_8, 0.0);

	t1[gxy] = f0(s0_0, s0_1, s0_2, s0_3, s0_4, s0_5, s0_6, s0_7, s0_8, s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8);
}

//!PASS 5
//!DESC conv4
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t1
//!OUT t0

#define l0(x, y) V4(O(t1, float2(x, y)))

V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
	V4 r = 0.0;
	r += mul(s0_0, M4(-6.479e-02, -9.976e-02, -1.507e-01, -9.934e-02, -1.046e-02, -1.471e-01, -4.218e-02, -8.348e-04, -5.963e-02, 1.519e-03, 5.897e-03, 5.284e-02, -4.467e-01, 4.779e-01, -1.953e-02, 1.951e-01));
	r += mul(s0_1, M4(-5.276e-02, -1.201e-01, -1.160e-01, 6.076e-02, -4.798e-02, -3.491e-01, -3.055e-01, -1.607e-01, -8.989e-02, 1.221e-01, -1.561e-01, 6.227e-02, -1.598e-01, -6.666e-01, 6.029e-01, -5.466e-01));
	r += mul(s0_2, M4(-1.331e-01, -4.988e-02, -2.217e-02, 3.405e-02, 2.261e-02, 1.352e-01, 1.124e-02, 8.259e-02, -3.548e-02, 2.454e-01, 4.417e-02, 2.297e-01, 1.780e-01, -2.203e-01, 5.913e-02, -2.201e-01));
	r += mul(s0_3, M4(1.348e-01, 5.544e-01, -4.335e-01, -3.619e-01, 1.011e-01, 2.665e-01, -2.627e-01, -1.800e-01, -1.158e-01, -8.543e-02, -7.868e-03, 2.056e-01, 1.988e-01, 1.174e+00, -1.291e-01, 1.131e-01));
	r += mul(s0_4, M4(4.504e-01, 1.025e-01, -1.449e-01, -3.442e-02, -4.525e-01, -1.513e-01, -8.135e-02, -9.669e-02, -3.287e-01, 5.251e-01, -6.540e-01, 7.386e-02, 2.603e-01, -8.246e-01, -1.378e-01, 2.363e+00));
	r += mul(s0_5, M4(-7.102e-02, -5.554e-02, -3.489e-02, -6.688e-02, 2.877e-01, -6.258e-02, 8.515e-02, -2.109e-01, -2.723e-01, 1.543e-01, 1.285e-01, 9.366e-02, 3.135e-02, -3.700e-01, -4.111e-01, 1.822e+00));
	r += mul(s0_6, M4(-4.018e-02, -3.412e-01, 5.388e-02, 4.947e-01, -3.234e-02, -6.778e-02, 3.825e-02, 1.313e-01, -6.083e-02, 3.439e-02, -1.081e-01, 6.456e-02, 2.287e-02, -2.470e-01, 2.026e-02, -1.886e-02));
	r += mul(s0_7, M4(2.410e-01, 1.529e-01, -1.370e-01, -1.389e-01, 1.549e-01, 8.308e-03, 3.064e-02, 3.925e-02, -9.013e-02, 1.131e-01, -9.240e-02, 3.740e-01, -1.009e-01, -6.576e-02, -1.491e-01, -3.452e-02));
	r += mul(s0_8, M4(-1.628e-01, -2.480e-02, -6.569e-02, 3.873e-02, 1.604e-02, 1.651e-02, -4.681e-02, -1.647e-02, -1.648e-02, 1.541e-01, 2.284e-02, 6.545e-01, 1.799e-03, 1.193e-03, -1.215e-01, 5.919e-02));
	r += mul(s1_0, M4(-1.115e-02, -5.014e-02, -1.499e-01, -7.414e-04, -6.944e-02, -4.168e-02, -1.254e-01, -6.576e-02, 2.946e-04, -2.669e-02, 4.109e-02, 1.949e-02, 1.242e-01, 1.753e-01, 9.717e-02, 1.446e-01));
	r += mul(s1_1, M4(-1.327e-02, -1.462e-01, -8.510e-02, -1.228e-02, 1.772e-01, 1.009e-01, -4.342e-02, -8.827e-02, -6.663e-02, -1.245e-01, -4.625e-02, -4.285e-02, 7.586e-02, -1.208e-01, 2.705e-01, -1.558e-01));
	r += mul(s1_2, M4(-7.024e-02, -3.045e-02, -1.916e-02, 4.979e-02, -9.145e-02, 2.285e-01, 4.612e-02, 2.217e-01, 7.690e-02, -4.332e-02, 6.032e-03, -2.370e-02, 3.802e-01, -8.124e-02, 1.982e-02, -8.310e-02));
	r += mul(s1_3, M4(1.238e-01, 5.787e-01, -5.332e-01, -2.806e-01, 1.208e-01, 6.549e-02, -2.040e-01, -2.578e-02, -5.878e-02, -1.496e-01, 1.213e-01, 1.489e-02, 9.569e-02, 1.964e-01, 6.477e-02, -2.939e-01));
	r += mul(s1_4, M4(5.825e-01, 2.257e-01, -1.943e-01, 1.101e-01, -3.240e-01, -2.967e-01, -4.203e-02, -3.636e-01, -1.062e-01, -3.799e-02, -4.444e-01, -7.607e-02, -3.056e-01, -2.926e-01, -4.582e-02, 2.795e-01));
	r += mul(s1_5, M4(-9.076e-02, -5.130e-02, -3.718e-02, -6.163e-02, 1.831e-01, -1.199e-01, 9.176e-02, -2.456e-01, 2.362e-01, -1.854e-01, -1.394e-01, 3.560e-03, 2.070e-02, -6.903e-02, -5.061e-02, 3.068e-02));
	r += mul(s1_6, M4(-4.988e-02, -3.880e-01, 3.001e-02, 3.892e-01, -2.827e-02, -2.880e-02, 4.071e-02, 2.861e-01, -4.016e-02, -1.085e-01, 9.207e-03, -7.367e-02, 9.072e-03, 8.960e-02, 5.334e-03, -6.480e-02));
	r += mul(s1_7, M4(2.900e-01, 1.450e-01, -1.401e-01, -2.809e-01, 1.218e-01, -3.153e-03, -2.544e-02, 1.898e-01, -7.197e-02, -3.721e-01, 4.042e-02, 9.918e-02, -1.132e-01, 3.578e-02, 4.000e-02, 6.991e-02));
	r += mul(s1_8, M4(-1.493e-01, -2.310e-02, -6.133e-02, 5.322e-02, -4.879e-02, -5.139e-02, -8.058e-02, 4.140e-02, 2.511e-01, 3.669e-02, -1.003e-01, -1.457e-01, 1.528e-01, 1.177e-01, 6.665e-02, -3.084e-02));
	r += V4(2.513e-04, -2.994e-02, -5.133e-02, -8.977e-03);
	return r;
}

void Pass5(uint2 blockStart, uint3 tid) {
	float2 pt = float2(GetInputPt());
	uint2 gxy = Rmp8x8(tid.x) + blockStart;
	uint2 size = GetInputSize();
	if (gxy.x >= size.x || gxy.y >= size.y) {
		return;
	}
	float2 pos = (gxy + 0.5) * pt;

	V4 s0_0 = l0(-1.0, -1.0);
	V4 s0_1 = l0(0.0, -1.0);
	V4 s0_2 = l0(1.0, -1.0);
	V4 s0_3 = l0(-1.0, 0.0);
	V4 s0_4 = l0(0.0, 0.0);
	V4 s0_5 = l0(1.0, 0.0);
	V4 s0_6 = l0(-1.0, 1.0);
	V4 s0_7 = l0(0.0, 1.0);
	V4 s0_8 = l0(1.0, 1.0);
	V4 s1_0 = -max(-s0_0, 0.0);
	V4 s1_1 = -max(-s0_1, 0.0);
	V4 s1_2 = -max(-s0_2, 0.0);
	V4 s1_3 = -max(-s0_3, 0.0);
	V4 s1_4 = -max(-s0_4, 0.0);
	V4 s1_5 = -max(-s0_5, 0.0);
	V4 s1_6 = -max(-s0_6, 0.0);
	V4 s1_7 = -max(-s0_7, 0.0);
	V4 s1_8 = -max(-s0_8, 0.0);
	s0_0 = max(s0_0, 0.0);
	s0_1 = max(s0_1, 0.0);
	s0_2 = max(s0_2, 0.0);
	s0_3 = max(s0_3, 0.0);
	s0_4 = max(s0_4, 0.0);
	s0_5 = max(s0_5, 0.0);
	s0_6 = max(s0_6, 0.0);
	s0_7 = max(s0_7, 0.0);
	s0_8 = max(s0_8, 0.0);

	t0[gxy] = f0(s0_0, s0_1, s0_2, s0_3, s0_4, s0_5, s0_6, s0_7, s0_8, s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8);
}

//!PASS 6
//!DESC conv5
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t0
//!OUT t1

#define l0(x, y) V4(O(t0, float2(x, y)))

V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
	V4 r = 0.0;
	r += mul(s0_0, M4(4.575e-01, 2.412e-01, 1.926e-01, 5.873e-02, 2.954e-02, -1.424e-01, 7.881e-03, 2.358e-04, -5.872e-02, -1.007e-01, -3.632e-02, 5.718e-02, 1.389e-01, -4.163e-02, -1.379e-01, 2.160e-03));
	r += mul(s0_1, M4(1.347e-01, -8.074e-01, -1.155e-01, 2.242e-01, -2.673e-01, 4.053e-01, 8.867e-02, -2.840e-02, 9.443e-02, 2.632e-01, 9.207e-02, -1.793e-02, 1.519e-01, 3.302e-03, 2.027e-01, 2.643e-02));
	r += mul(s0_2, M4(1.462e-02, -7.543e-02, -6.080e-02, 7.431e-02, -3.673e-02, -1.665e-01, -2.745e-01, -4.416e-02, -3.270e-01, 7.677e-01, 7.241e-01, -1.157e-01, -8.204e-03, 2.172e-02, 3.183e-01, 3.931e-02));
	r += mul(s0_3, M4(1.168e+00, -8.427e-01, -3.237e-03, 5.416e-02, 1.694e-02, -1.042e-01, -2.173e-01, -1.089e-01, -9.881e-02, -1.109e-01, -1.003e-01, -5.080e-02, -9.279e-02, -1.111e-01, -2.699e-02, -2.297e-02));
	r += mul(s0_4, M4(-4.884e-01, -4.472e-01, -9.701e-02, 8.789e-01, 1.962e-02, 5.041e-01, 3.221e-01, -4.622e-02, 9.039e-02, -2.531e-01, 6.228e-01, 1.590e-02, 1.804e-02, 7.795e-02, -8.005e-02, -6.310e-03));
	r += mul(s0_5, M4(-6.567e-02, -5.161e-02, 5.550e-02, 5.285e-02, -6.147e-02, -1.840e-01, 2.028e-01, 4.014e-01, 4.070e-01, -1.022e-01, 1.414e+00, -3.126e-01, 7.508e-03, 1.013e-01, -7.300e-02, -4.282e-01));
	r += mul(s0_6, M4(1.721e+00, 1.776e-01, -8.690e-02, -1.102e-01, -8.467e-02, -2.165e-02, 6.238e-02, 2.052e-02, 2.763e-01, -3.472e-02, -1.179e-01, 2.993e-02, -6.860e-02, 1.887e-02, 3.140e-02, -6.853e-02));
	r += mul(s0_7, M4(1.937e-01, 1.975e-01, -2.456e-01, -1.360e+00, 1.792e-01, -5.969e-02, -7.670e-02, 2.606e-01, 1.355e-01, -9.109e-03, 2.756e-01, 6.674e-02, 1.312e-02, -1.542e-02, 2.236e-02, 1.997e-01));
	r += mul(s0_8, M4(4.255e-02, -1.452e-02, -8.732e-02, -1.084e-01, 1.495e-02, 1.302e-02, -9.151e-02, -2.814e-01, 5.197e-02, 2.866e-02, 5.490e-01, 4.310e-01, 3.666e-02, -3.380e-03, -2.830e-02, -8.223e-02));
	r += mul(s1_0, M4(2.549e-02, 7.469e-02, -5.290e-02, -4.972e-02, -2.340e-01, -1.875e-01, 1.656e-01, 5.697e-02, -8.570e-02, -1.520e-01, -2.622e-02, 1.043e-02, -2.377e-01, -3.927e-02, 1.539e-01, 4.528e-02));
	r += mul(s1_1, M4(-1.188e-02, -9.781e-02, 1.606e-01, 5.138e-02, -4.165e-01, 8.262e-01, 1.709e-01, -1.063e-01, 8.393e-03, 7.300e-02, -9.347e-02, -6.226e-02, -3.633e-01, -4.453e-01, 2.190e-01, 2.415e-01));
	r += mul(s1_2, M4(-4.011e-02, 3.404e-02, 1.013e-01, 3.551e-02, 9.692e-02, -2.109e-01, 1.897e-01, -2.192e-01, -1.703e-01, 5.317e-01, 1.354e-01, -2.027e-01, -3.658e-01, -1.845e-01, -5.465e-01, 1.436e-01));
	r += mul(s1_3, M4(7.674e-01, 1.677e-01, -7.875e-02, 7.537e-03, -4.911e-01, -1.083e-01, 7.183e-03, -1.107e-01, -2.514e-02, -1.257e-01, -5.070e-02, -3.886e-02, 1.368e-01, -1.991e-02, -1.698e-01, -7.850e-03));
	r += mul(s1_4, M4(-5.096e-02, 7.912e-02, -2.105e-01, 1.149e-01, 9.798e-02, 2.243e-01, -3.434e-01, 3.492e-01, -1.265e-01, -1.839e-01, -1.337e-01, -6.909e-02, -8.552e-01, 1.334e-01, 8.652e-01, -3.408e-01));
	r += mul(s1_5, M4(-2.933e-02, 1.424e-01, 6.542e-02, -1.710e-01, -1.459e-01, -3.069e-02, -1.275e-01, -9.443e-02, 2.657e-01, -4.784e-04, -6.729e-03, -1.910e-01, -4.628e-01, 3.808e-02, -1.470e-01, 1.480e-01));
	r += mul(s1_6, M4(1.512e-01, -1.755e-02, -5.440e-02, 1.317e-02, -7.181e-02, -6.842e-03, -7.375e-02, -8.356e-02, 7.332e-02, -9.437e-02, -1.008e-01, -4.731e-02, -9.102e-02, -8.192e-03, 7.862e-04, 6.417e-02));
	r += mul(s1_7, M4(2.457e-01, -1.058e-01, -2.777e-02, -1.532e-03, 7.609e-02, 3.452e-02, 1.774e-01, 3.296e-01, 6.779e-02, -6.683e-02, 1.485e-01, 7.321e-02, -3.082e-02, -4.348e-02, 3.558e-03, 9.111e-03));
	r += mul(s1_8, M4(1.104e-01, 5.040e-03, 9.642e-03, -8.991e-02, -2.134e-01, 3.758e-02, -1.244e-01, -1.987e-01, -7.007e-02, 6.792e-03, 1.369e-01, 5.332e-01, -5.354e-02, -2.024e-02, -1.038e-01, -4.812e-02));
	r += V4(4.102e-03, 1.192e-03, -2.598e-03, -2.812e-03);
	return r;
}

void Pass6(uint2 blockStart, uint3 tid) {
	float2 pt = float2(GetInputPt());
	uint2 gxy = Rmp8x8(tid.x) + blockStart;
	uint2 size = GetInputSize();
	if (gxy.x >= size.x || gxy.y >= size.y) {
		return;
	}
	float2 pos = (gxy + 0.5) * pt;

	V4 s0_0 = l0(-1.0, -1.0);
	V4 s0_1 = l0(0.0, -1.0);
	V4 s0_2 = l0(1.0, -1.0);
	V4 s0_3 = l0(-1.0, 0.0);
	V4 s0_4 = l0(0.0, 0.0);
	V4 s0_5 = l0(1.0, 0.0);
	V4 s0_6 = l0(-1.0, 1.0);
	V4 s0_7 = l0(0.0, 1.0);
	V4 s0_8 = l0(1.0, 1.0);
	V4 s1_0 = -max(-s0_0, 0.0);
	V4 s1_1 = -max(-s0_1, 0.0);
	V4 s1_2 = -max(-s0_2, 0.0);
	V4 s1_3 = -max(-s0_3, 0.0);
	V4 s1_4 = -max(-s0_4, 0.0);
	V4 s1_5 = -max(-s0_5, 0.0);
	V4 s1_6 = -max(-s0_6, 0.0);
	V4 s1_7 = -max(-s0_7, 0.0);
	V4 s1_8 = -max(-s0_8, 0.0);
	s0_0 = max(s0_0, 0.0);
	s0_1 = max(s0_1, 0.0);
	s0_2 = max(s0_2, 0.0);
	s0_3 = max(s0_3, 0.0);
	s0_4 = max(s0_4, 0.0);
	s0_5 = max(s0_5, 0.0);
	s0_6 = max(s0_6, 0.0);
	s0_7 = max(s0_7, 0.0);
	s0_8 = max(s0_8, 0.0);

	t1[gxy] = f0(s0_0, s0_1, s0_2, s0_3, s0_4, s0_5, s0_6, s0_7, s0_8, s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8);
}

//!PASS 7
//!DESC conv6
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t1
//!OUT t0

#define l0(x, y) V4(O(t1, float2(x, y)))

V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
	V4 r = 0.0;
	r += mul(s0_0, M4(6.200e-02, 5.385e-02, -5.478e-02, 3.955e-02, -1.722e-02, -1.194e-01, 8.331e-02, -9.296e-02, -2.161e-02, 8.716e-02, -5.918e-02, 1.032e-01, 4.954e-02, -3.822e-02, 8.472e-02, -2.191e-01));
	r += mul(s0_1, M4(2.503e-01, 5.635e-02, 7.355e-03, -2.025e-01, 7.104e-02, -1.324e-01, -3.051e-02, 2.246e-02, -4.480e-02, 6.693e-03, 4.467e-02, 3.388e-02, 4.262e-01, 1.488e-01, -8.809e-01, 5.350e-01));
	r += mul(s0_2, M4(-7.511e-03, 1.921e-01, -3.653e-01, 2.096e-02, 2.413e-02, 4.846e-02, -1.538e-01, 3.359e-02, 5.958e-03, -1.033e-02, 2.389e-02, 1.283e-02, -5.270e-02, 2.842e-01, 5.681e-02, -3.578e-02));
	r += mul(s0_3, M4(-2.198e-02, -1.674e-02, 3.330e-02, 3.249e-02, -4.430e-02, 9.217e-02, -3.348e-02, -3.546e-01, 1.228e-01, 3.875e-02, 7.220e-03, 6.719e-02, -8.768e-01, -1.165e-02, -3.862e-02, -2.045e-02));
	r += mul(s0_4, M4(-6.935e-01, -4.898e-01, 2.252e-01, -1.647e-01, -6.408e-02, 4.562e-01, -6.617e-01, 1.220e-01, 1.053e-02, -9.937e-02, -1.118e-02, 3.272e-01, -9.081e-02, 2.353e-02, 4.776e-01, -1.238e-01));
	r += mul(s0_5, M4(2.481e-01, -3.296e-01, -3.372e-02, -2.008e-02, 5.924e-03, 1.762e-02, 3.642e-01, -1.182e-01, -2.219e-02, -4.332e-02, -9.762e-02, 3.537e-02, 2.114e-02, -5.440e-02, 3.124e-01, 5.069e-02));
	r += mul(s0_6, M4(-5.465e-02, -5.352e-03, -3.419e-03, -6.733e-02, -8.079e-02, -6.569e-02, -1.494e-02, -3.462e-01, -8.125e-03, 2.572e-03, -3.894e-02, -3.246e-02, -1.566e-02, -3.004e-02, 1.145e-01, 6.794e-02));
	r += mul(s0_7, M4(4.788e-02, 7.675e-03, -7.030e-02, -2.384e-02, -3.070e-01, -7.080e-01, -2.017e-01, 9.579e-02, 1.259e-01, -1.004e-02, -1.287e-01, 3.334e-02, -9.642e-02, -8.073e-02, 2.546e-02, 5.204e-02));
	r += mul(s0_8, M4(-6.015e-02, 1.650e-01, -5.471e-02, -1.454e-01, -2.785e-02, -1.831e-01, 1.123e-01, 3.453e-02, -1.179e-02, 1.722e-02, -1.068e-02, -2.608e-02, 1.514e-04, -1.287e-02, -7.741e-03, -9.765e-03));
	r += mul(s1_0, M4(-4.922e-02, -5.675e-03, -2.161e-02, 3.164e-02, -2.003e-02, -3.890e-02, 5.198e-02, -1.811e-03, -3.385e-02, -1.510e-02, -2.289e-02, 1.009e-01, 4.427e-02, -1.763e-01, 1.255e-01, -5.073e-02));
	r += mul(s1_1, M4(1.057e-01, -8.124e-02, 1.131e-01, -1.361e-01, 4.740e-02, -6.425e-02, 8.930e-03, 5.318e-02, 5.266e-02, -6.003e-02, 1.320e-01, 4.163e-02, 1.277e-01, -2.404e-01, -1.696e-01, 2.204e-01));
	r += mul(s1_2, M4(2.723e-02, 1.918e-01, -2.822e-01, -1.877e-02, -4.599e-03, 7.591e-02, -1.128e-01, -6.519e-03, 2.311e-02, -1.684e-01, 2.293e-01, -1.042e-01, -1.882e-02, 4.970e-02, -1.309e-01, -8.894e-03));
	r += mul(s1_3, M4(4.883e-02, 2.819e-02, 4.318e-02, 3.186e-02, 7.782e-02, 1.741e-01, -8.927e-02, 4.005e-02, 5.888e-02, -1.057e-01, 9.692e-02, 8.032e-02, -1.086e-01, 6.323e-02, -8.520e-02, -1.273e-02));
	r += mul(s1_4, M4(-1.746e-01, -2.834e-02, -3.694e-02, 3.226e-01, -2.541e-01, 6.860e-01, -1.436e-01, 1.705e-01, 2.614e-01, -6.751e-02, 5.646e-02, 3.666e-01, -2.621e-02, 4.951e-01, -1.090e-01, -3.168e-01));
	r += mul(s1_5, M4(1.513e-01, 5.210e-02, 2.625e-01, -6.303e-02, -2.252e-02, -9.485e-02, 4.776e-01, -1.789e-01, -1.291e-01, -9.714e-02, -1.427e-01, -1.165e-01, 2.415e-02, 9.790e-02, 6.024e-02, -9.622e-02));
	r += mul(s1_6, M4(3.751e-02, -2.907e-02, -1.762e-02, -9.545e-02, 2.866e-01, -7.329e-02, -9.787e-03, 4.513e-03, -9.486e-02, -2.446e-02, -2.357e-02, -5.002e-02, 4.973e-02, 6.256e-02, -2.532e-02, -1.817e-02));
	r += mul(s1_7, M4(-6.855e-02, -6.762e-02, -6.269e-02, -6.947e-02, -1.389e-01, -1.915e-01, -4.806e-02, 1.870e-01, 1.298e-01, 6.268e-03, -5.985e-02, -5.396e-02, -3.048e-02, -5.396e-03, -9.720e-02, 3.289e-03));
	r += mul(s1_8, M4(-2.052e-02, -8.106e-02, -1.721e-02, 9.911e-03, -8.521e-02, 4.832e-02, -1.708e-01, -6.445e-02, -9.788e-02, 8.836e-02, -1.204e-01, -1.123e-01, 1.514e-02, 1.628e-02, -5.003e-02, -6.128e-03));
	r += V4(1.448e-03, -2.432e-03, -8.004e-04, 5.896e-05);
	return r;
}

void Pass7(uint2 blockStart, uint3 tid) {
	float2 pt = float2(GetInputPt());
	uint2 gxy = Rmp8x8(tid.x) + blockStart;
	uint2 size = GetInputSize();
	if (gxy.x >= size.x || gxy.y >= size.y) {
		return;
	}
	float2 pos = (gxy + 0.5) * pt;

	V4 s0_0 = l0(-1.0, -1.0);
	V4 s0_1 = l0(0.0, -1.0);
	V4 s0_2 = l0(1.0, -1.0);
	V4 s0_3 = l0(-1.0, 0.0);
	V4 s0_4 = l0(0.0, 0.0);
	V4 s0_5 = l0(1.0, 0.0);
	V4 s0_6 = l0(-1.0, 1.0);
	V4 s0_7 = l0(0.0, 1.0);
	V4 s0_8 = l0(1.0, 1.0);
	V4 s1_0 = -max(-s0_0, 0.0);
	V4 s1_1 = -max(-s0_1, 0.0);
	V4 s1_2 = -max(-s0_2, 0.0);
	V4 s1_3 = -max(-s0_3, 0.0);
	V4 s1_4 = -max(-s0_4, 0.0);
	V4 s1_5 = -max(-s0_5, 0.0);
	V4 s1_6 = -max(-s0_6, 0.0);
	V4 s1_7 = -max(-s0_7, 0.0);
	V4 s1_8 = -max(-s0_8, 0.0);
	s0_0 = max(s0_0, 0.0);
	s0_1 = max(s0_1, 0.0);
	s0_2 = max(s0_2, 0.0);
	s0_3 = max(s0_3, 0.0);
	s0_4 = max(s0_4, 0.0);
	s0_5 = max(s0_5, 0.0);
	s0_6 = max(s0_6, 0.0);
	s0_7 = max(s0_7, 0.0);
	s0_8 = max(s0_8, 0.0);

	t0[gxy] = f0(s0_0, s0_1, s0_2, s0_3, s0_4, s0_5, s0_6, s0_7, s0_8, s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8);
}

//!PASS 8
//!DESC conv7
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t0
//!OUT t1

#define l0(x, y) V4(O(t0, float2(x, y)))

V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
	V4 r = 0.0;
	r += mul(s0_0, M4(5.901e-02, -1.033e-01, -1.441e-01, 4.291e-02, 2.355e-02, -1.199e-01, -1.741e-01, -5.263e-03, -6.030e-03, -4.043e-02, 1.910e-01, 8.326e-03, 2.913e-02, 1.969e-02, -1.380e-01, 9.492e-02));
	r += mul(s0_1, M4(-1.616e-01, 1.649e-01, -1.133e-02, -1.037e-01, -1.060e-02, 2.299e-01, -5.302e-02, -2.329e-01, -8.540e-02, 2.232e-01, 2.647e-01, 3.922e-01, 5.387e-02, 5.841e-01, -1.264e-01, -1.440e-01));
	r += mul(s0_2, M4(-1.944e-02, -7.262e-02, 9.583e-02, 3.448e-02, 4.402e-02, 5.319e-02, -2.384e-02, 4.652e-02, 6.280e-02, -4.195e-02, 1.573e-02, 7.059e-02, 1.029e-01, -1.784e-02, -3.735e-02, -4.952e-02));
	r += mul(s0_3, M4(7.393e-02, -1.825e-01, -2.983e-01, -5.798e-02, -2.475e-01, -4.958e-02, 6.660e-01, -2.202e-01, -9.158e-02, 4.280e-04, 2.472e-01, -2.979e-01, -9.887e-02, 6.188e-02, 2.163e-01, -9.358e-03));
	r += mul(s0_4, M4(-8.664e-01, 2.357e-01, 3.390e-01, -5.275e-01, -2.213e-01, -4.992e-01, 5.479e-01, 4.245e-01, -7.542e-02, 4.854e-01, -3.525e-01, 3.950e-01, 3.619e-01, -3.968e-01, -3.447e-01, 5.089e-01));
	r += mul(s0_5, M4(-9.239e-02, -6.370e-01, -7.252e-02, -3.435e-01, -1.057e-01, 1.616e-01, -4.413e-02, 1.824e-01, 2.001e-02, -1.343e-01, -5.730e-02, 7.302e-02, -2.361e-02, -9.044e-02, -1.041e-01, 2.971e-01));
	r += mul(s0_6, M4(-2.803e-02, -8.707e-02, -1.407e-01, -2.685e-02, 1.099e-01, 1.721e-01, 1.612e-01, 6.962e-02, -1.659e-02, 7.845e-02, 2.165e-01, -7.067e-02, 1.666e-02, 7.051e-02, 6.373e-02, 4.391e-02));
	r += mul(s0_7, M4(-1.560e-01, -2.698e-02, -5.684e-01, -1.184e-01, 7.742e-01, -1.023e-03, -8.177e-02, 2.857e-01, 2.253e-02, -1.400e-02, -6.523e-02, 7.644e-02, 1.789e-01, -8.433e-03, 1.041e-01, 7.009e-02));
	r += mul(s0_8, M4(-1.491e-01, -2.037e-01, -2.499e-01, -7.730e-02, 1.051e-01, -1.718e-02, -1.762e-01, 4.808e-02, -3.068e-03, 1.737e-02, -3.772e-04, 4.732e-02, 7.205e-02, 7.901e-02, -1.759e-02, 8.476e-02));
	r += mul(s1_0, M4(4.810e-02, -1.822e-02, -1.150e-01, -1.679e-02, -5.481e-02, -7.544e-02, 2.213e-01, 2.615e-02, -2.628e-03, -1.482e-01, -5.570e-02, 5.137e-02, -1.381e-02, -1.878e-03, -3.132e-02, -3.309e-02));
	r += mul(s1_1, M4(1.101e-01, 1.003e-01, -4.307e-01, -2.520e-02, 1.138e-02, -1.966e-01, 6.664e-02, 1.114e-01, -1.431e-01, 3.634e-01, 4.274e-02, -8.279e-02, -5.291e-02, 3.540e-01, 8.995e-02, -1.401e-01));
	r += mul(s1_2, M4(7.230e-02, 4.684e-01, -6.542e-02, -2.792e-01, 2.936e-02, 3.476e-03, -1.024e-02, 1.880e-01, 1.898e-02, 2.529e-02, 8.537e-03, -6.073e-03, 1.025e-01, -2.320e-01, -1.804e-02, 5.471e-02));
	r += mul(s1_3, M4(-9.258e-03, -7.731e-03, 4.285e-02, -4.725e-02, -3.878e-02, -1.749e-02, -1.681e-02, -1.020e-01, -3.975e-02, 1.609e-02, 8.299e-02, -1.824e-01, -2.500e-02, 3.516e-02, 8.591e-02, 1.714e-02));
	r += mul(s1_4, M4(-2.210e-01, 1.534e-01, 3.410e-01, -2.552e-01, -5.090e-02, 1.582e-02, 1.802e-01, -1.333e-01, -5.371e-01, 3.751e-01, -1.323e-01, 3.018e-01, 1.756e-01, -9.756e-02, -4.873e-01, 4.985e-01));
	r += mul(s1_5, M4(-1.073e-02, 2.919e-01, -2.025e-01, 3.240e-01, 4.318e-02, -1.972e-02, -1.612e-01, 3.528e-01, -6.472e-02, -6.212e-02, 3.146e-02, 6.391e-02, 4.950e-02, -6.270e-01, -1.985e-02, 4.680e-02));
	r += mul(s1_6, M4(-2.215e-02, 1.836e-02, 5.021e-02, -3.016e-02, -7.854e-03, 1.135e-02, 3.407e-02, -2.923e-02, -5.384e-03, 6.570e-02, 2.437e-01, -8.712e-02, 2.275e-02, -2.291e-03, -7.378e-02, 5.231e-02));
	r += mul(s1_7, M4(-4.186e-02, 6.944e-02, 8.353e-02, -1.927e-02, 3.937e-02, 2.105e-02, 7.152e-02, 5.635e-03, 1.114e-01, -3.772e-02, -1.853e-01, 6.636e-02, 4.654e-02, -1.008e-01, -1.625e-01, 7.888e-02));
	r += mul(s1_8, M4(5.288e-02, -5.516e-02, -4.014e-02, 8.854e-02, 2.434e-02, 9.192e-02, -1.203e-02, 6.813e-02, 4.626e-02, -4.892e-02, 4.700e-03, 7.578e-02, -5.040e-02, 3.497e-02, 3.176e-02, -9.741e-02));
	r += V4(2.671e-03, -5.536e-03, -4.013e-03, 4.378e-03);
	return r;
}

void Pass8(uint2 blockStart, uint3 tid) {
	float2 pt = float2(GetInputPt());
	uint2 gxy = Rmp8x8(tid.x) + blockStart;
	uint2 size = GetInputSize();
	if (gxy.x >= size.x || gxy.y >= size.y) {
		return;
	}
	float2 pos = (gxy + 0.5) * pt;

	V4 s0_0 = l0(-1.0, -1.0);
	V4 s0_1 = l0(0.0, -1.0);
	V4 s0_2 = l0(1.0, -1.0);
	V4 s0_3 = l0(-1.0, 0.0);
	V4 s0_4 = l0(0.0, 0.0);
	V4 s0_5 = l0(1.0, 0.0);
	V4 s0_6 = l0(-1.0, 1.0);
	V4 s0_7 = l0(0.0, 1.0);
	V4 s0_8 = l0(1.0, 1.0);
	V4 s1_0 = -max(-s0_0, 0.0);
	V4 s1_1 = -max(-s0_1, 0.0);
	V4 s1_2 = -max(-s0_2, 0.0);
	V4 s1_3 = -max(-s0_3, 0.0);
	V4 s1_4 = -max(-s0_4, 0.0);
	V4 s1_5 = -max(-s0_5, 0.0);
	V4 s1_6 = -max(-s0_6, 0.0);
	V4 s1_7 = -max(-s0_7, 0.0);
	V4 s1_8 = -max(-s0_8, 0.0);
	s0_0 = max(s0_0, 0.0);
	s0_1 = max(s0_1, 0.0);
	s0_2 = max(s0_2, 0.0);
	s0_3 = max(s0_3, 0.0);
	s0_4 = max(s0_4, 0.0);
	s0_5 = max(s0_5, 0.0);
	s0_6 = max(s0_6, 0.0);
	s0_7 = max(s0_7, 0.0);
	s0_8 = max(s0_8, 0.0);

	t1[gxy] = f0(s0_0, s0_1, s0_2, s0_3, s0_4, s0_5, s0_6, s0_7, s0_8, s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8);
}

//!PASS 9
//!DESC conv8
//!BLOCK_SIZE 8
//!NUM_THREADS 64
//!IN t1
//!OUT t0

#define l0(x, y) V4(O(t1, float2(x, y)))

V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
	V4 r = 0.0;
	r += mul(s0_0, M4(8.283e-02, 5.262e-02, 1.580e-02, 4.991e-02, 6.836e-02, -3.234e-02, 5.630e-02, 1.275e-01, 5.398e-03, 9.866e-04, -1.054e-02, 1.601e-02, 1.546e-02, -7.786e-02, -2.630e-02, -3.023e-02));
	r += mul(s0_1, M4(9.285e-02, 3.403e-01, -4.572e-02, 1.431e-01, 2.876e-01, -3.271e-01, -8.133e-04, 5.998e-01, 4.515e-02, 9.836e-02, 2.315e-02, 1.724e-01, -8.080e-02, -1.978e-01, -5.366e-02, -4.535e-02));
	r += mul(s0_2, M4(1.708e-02, -8.374e-02, -1.831e-02, 1.744e-02, 4.902e-02, -1.037e-02, -3.508e-02, 3.501e-02, 1.160e-01, 2.529e-01, 4.235e-02, 4.233e-02, -5.953e-03, -1.398e-01, -8.815e-03, 1.053e-02));
	r += mul(s0_3, M4(-2.836e-03, -2.496e-01, 2.703e-02, 9.490e-02, 3.985e-01, -9.458e-02, 1.355e-01, 5.917e-01, 5.597e-03, -8.963e-02, 5.238e-02, 4.360e-02, -1.070e-01, 7.593e-02, 6.376e-02, -1.498e-01));
	r += mul(s0_4, M4(3.214e-01, -8.045e-01, 6.621e-01, -1.261e-01, -1.487e+00, 1.086e+00, 3.779e-01, -1.762e+00, 2.721e-01, -3.815e-02, -1.450e-01, 4.063e-01, 2.804e-01, 3.876e-01, 2.607e-01, 2.174e-01));
	r += mul(s0_5, M4(-3.896e-01, 3.340e-01, -2.529e-01, -6.519e-02, -1.815e-01, 5.542e-02, -1.669e-01, 1.732e-02, 2.995e-01, 4.942e-02, 6.557e-02, -1.386e-01, -1.392e-01, 2.822e-01, 2.016e-02, -1.313e-01));
	r += mul(s0_6, M4(-2.130e-02, 4.137e-02, 7.324e-02, 4.834e-03, 9.333e-02, -2.998e-01, 4.229e-01, 9.535e-02, -2.595e-02, 2.955e-02, 7.491e-02, -3.028e-02, -2.850e-02, 1.582e-02, -1.076e-01, -3.159e-02));
	r += mul(s0_7, M4(-3.601e-02, 5.993e-02, -1.190e-02, -6.800e-02, 6.894e-03, -2.095e-01, -9.548e-02, -2.539e-02, -2.390e-02, 2.947e-02, 1.581e-01, -5.305e-03, 1.029e-01, -1.456e-01, -3.526e-02, 9.251e-02));
	r += mul(s0_8, M4(-7.206e-02, 9.690e-02, -4.464e-02, -6.999e-03, 3.140e-02, -4.201e-02, -6.364e-03, 5.280e-03, -1.412e-01, 1.696e-01, -1.274e-01, -9.546e-02, 5.285e-02, -1.072e-01, 5.994e-02, 1.293e-02));
	r += mul(s1_0, M4(-1.808e-02, 1.243e-01, -6.814e-02, -4.219e-03, 1.273e-02, 2.752e-02, 3.764e-02, 3.650e-02, 7.663e-04, 6.843e-03, 1.380e-02, -3.235e-02, 5.400e-02, -5.352e-02, 1.190e-02, -1.028e-01));
	r += mul(s1_1, M4(2.568e-01, 2.764e-01, 7.740e-02, 1.273e-01, 7.059e-02, 6.668e-02, 4.211e-02, 6.293e-02, -4.164e-02, 2.210e-01, -1.293e-02, 8.369e-02, 2.046e-01, 1.238e-01, 9.491e-02, 4.614e-02));
	r += mul(s1_2, M4(-2.387e-02, 3.174e-01, 8.165e-02, -6.680e-02, -1.516e-02, 1.482e-02, -1.342e-02, 1.692e-02, -2.288e-02, -6.891e-02, -5.559e-02, 4.771e-02, 3.290e-02, 1.234e-01, 4.334e-02, -5.106e-02));
	r += mul(s1_3, M4(6.216e-02, -2.114e-01, -1.616e-01, 1.664e-01, 3.796e-02, 6.036e-02, -1.106e-01, 1.398e-01, -3.139e-02, -6.274e-02, 4.988e-02, -6.274e-02, 2.296e-02, -5.131e-02, 5.052e-02, -8.866e-02));
	r += mul(s1_4, M4(2.647e-01, -7.858e-01, 1.597e-01, -8.262e-01, -3.213e-01, 2.427e-01, 1.686e-01, -4.251e-01, 1.505e-01, 3.244e-02, 1.023e-01, 1.962e-01, -1.116e-01, 3.525e-01, 8.848e-01, -1.945e-01));
	r += mul(s1_5, M4(-2.549e-01, -1.429e-01, -3.696e-02, 3.042e-01, -1.256e-01, 2.760e-02, -3.650e-02, 7.985e-02, -1.958e-01, 3.076e-01, -9.253e-02, -8.512e-02, -1.708e-01, -3.422e-04, -8.181e-02, 2.319e-01));
	r += mul(s1_6, M4(-3.382e-02, 6.627e-02, 1.158e-01, -3.044e-02, -7.983e-03, -7.855e-02, 1.729e-02, 3.219e-04, -1.764e-02, 4.065e-02, -1.400e-02, -2.387e-02, 2.673e-03, 5.460e-03, -4.992e-02, -1.573e-02));
	r += mul(s1_7, M4(-2.505e-02, 1.763e-01, -4.433e-01, -1.024e-01, 1.391e-01, -2.435e-01, -5.358e-02, 5.203e-02, 3.157e-02, 2.012e-02, 7.424e-03, 3.723e-02, -2.388e-02, 7.204e-02, -4.522e-01, -1.187e-02));
	r += mul(s1_8, M4(9.737e-02, 7.067e-02, 4.072e-02, 4.303e-02, 2.890e-02, -1.810e-02, 5.156e-03, -1.953e-02, -3.503e-02, 7.492e-02, 1.402e-02, -9.796e-03, 2.320e-01, -2.135e-01, 1.462e-01, 1.194e-01));
	r += V4(-5.006e-05, -2.252e-04, -1.752e-03, 4.586e-04);
	return r;
}

void Pass9(uint2 blockStart, uint3 tid) {
	float2 pt = float2(GetInputPt());
	uint2 gxy = Rmp8x8(tid.x) + blockStart;
	uint2 size = GetInputSize();
	if (gxy.x >= size.x || gxy.y >= size.y) {
		return;
	}
	float2 pos = (gxy + 0.5) * pt;

	V4 s0_0 = l0(-1.0, -1.0);
	V4 s0_1 = l0(0.0, -1.0);
	V4 s0_2 = l0(1.0, -1.0);
	V4 s0_3 = l0(-1.0, 0.0);
	V4 s0_4 = l0(0.0, 0.0);
	V4 s0_5 = l0(1.0, 0.0);
	V4 s0_6 = l0(-1.0, 1.0);
	V4 s0_7 = l0(0.0, 1.0);
	V4 s0_8 = l0(1.0, 1.0);
	V4 s1_0 = -max(-s0_0, 0.0);
	V4 s1_1 = -max(-s0_1, 0.0);
	V4 s1_2 = -max(-s0_2, 0.0);
	V4 s1_3 = -max(-s0_3, 0.0);
	V4 s1_4 = -max(-s0_4, 0.0);
	V4 s1_5 = -max(-s0_5, 0.0);
	V4 s1_6 = -max(-s0_6, 0.0);
	V4 s1_7 = -max(-s0_7, 0.0);
	V4 s1_8 = -max(-s0_8, 0.0);
	s0_0 = max(s0_0, 0.0);
	s0_1 = max(s0_1, 0.0);
	s0_2 = max(s0_2, 0.0);
	s0_3 = max(s0_3, 0.0);
	s0_4 = max(s0_4, 0.0);
	s0_5 = max(s0_5, 0.0);
	s0_6 = max(s0_6, 0.0);
	s0_7 = max(s0_7, 0.0);
	s0_8 = max(s0_8, 0.0);

	t0[gxy] = f0(s0_0, s0_1, s0_2, s0_3, s0_4, s0_5, s0_6, s0_7, s0_8, s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8);
}

//!PASS 10
//!DESC out-shuffle
//!BLOCK_SIZE 16
//!NUM_THREADS 64
//!IN INPUT, t0
//!OUT OUTPUT

#define l0(x, y) V4(O(t0, float2(x, y)))

V4 f0(V4 s0_0, V4 s0_1, V4 s0_2, V4 s0_3, V4 s0_4, V4 s0_5, V4 s0_6, V4 s0_7, V4 s0_8, V4 s1_0, V4 s1_1, V4 s1_2, V4 s1_3, V4 s1_4, V4 s1_5, V4 s1_6, V4 s1_7, V4 s1_8) {
	V4 r = 0.0;
	r += mul(s0_0, M4(2.670e-02, -1.964e-03, 2.191e-02, 3.109e-02, 1.911e-02, -2.017e-02, -2.948e-02, -2.237e-02, -3.845e-02, -7.954e-03, -3.472e-02, -2.253e-02, -1.571e-02, -6.613e-03, -1.489e-02, -2.647e-02));
	r += mul(s0_1, M4(-6.714e-02, -2.106e-02, 7.577e-03, 1.788e-02, 8.081e-02, 8.813e-02, -5.510e-02, -2.724e-02, 1.150e-01, 5.284e-02, -8.964e-02, -3.024e-02, 5.215e-02, 5.334e-02, -1.180e-02, 6.927e-03));
	r += mul(s0_2, M4(1.036e-02, 1.826e-02, -8.095e-03, -9.967e-03, 1.368e-03, 3.479e-02, -1.887e-03, -2.161e-02, -3.464e-02, -1.124e-01, -4.623e-03, -5.295e-03, -7.199e-03, -4.285e-02, 8.862e-03, -1.610e-02));
	r += mul(s0_3, M4(2.388e-01, -1.001e-03, 1.699e-01, -4.519e-02, -3.274e-01, 1.550e-01, 3.748e-02, 3.435e-02, -1.655e-01, 1.227e-02, -1.372e-01, 4.700e-02, -1.636e-01, 1.222e-02, -1.323e-01, 3.239e-02));
	r += mul(s0_4, M4(1.698e-01, 4.561e-01, -1.355e-01, 1.831e-01, -3.815e-01, -7.832e-01, 1.738e-01, 4.516e-02, 2.803e-01, -4.239e-01, 8.945e-01, -1.339e-02, -3.701e-01, -3.731e-01, 1.765e-01, -1.343e-01));
	r += mul(s0_5, M4(-4.653e-02, -8.470e-02, -1.076e-03, -7.153e-02, 1.022e-02, -2.560e-02, -1.154e-02, 2.252e-02, -1.053e-01, 4.014e-01, -1.479e-01, 3.667e-01, 9.425e-02, -8.079e-02, 5.594e-03, 4.870e-02));
	r += mul(s0_6, M4(-6.274e-02, -3.430e-02, -5.955e-02, 1.220e-02, -6.075e-02, 1.284e-02, -8.384e-02, 2.143e-01, -2.050e-02, -8.887e-03, -1.445e-02, 1.797e-02, 1.436e-01, -8.067e-04, 1.013e-01, 3.847e-03));
	r += mul(s0_7, M4(6.862e-02, -7.230e-02, -2.461e-01, -3.760e-01, 4.038e-02, -2.634e-02, -2.725e-01, -4.389e-01, 9.088e-03, -1.873e-02, -9.497e-02, -1.860e-01, -1.038e-01, 2.502e-01, -6.194e-01, 4.470e-02));
	r += mul(s0_8, M4(-1.984e-02, 4.173e-02, 5.328e-02, 5.554e-02, 1.241e-03, -2.290e-03, 5.972e-02, 4.381e-02, -3.320e-03, -1.434e-04, -5.754e-02, -6.072e-02, -6.854e-03, 6.781e-02, 1.208e-01, -5.469e-02));
	r += mul(s1_0, M4(7.050e-02, -3.676e-02, 7.009e-03, 1.431e-02, -1.258e-02, -6.854e-03, -9.803e-04, 5.955e-03, -3.077e-03, -2.372e-02, 8.060e-03, -5.992e-02, -7.957e-02, 2.905e-02, 3.914e-04, -1.408e-02));
	r += mul(s1_1, M4(-1.068e-01, 4.589e-02, -1.399e-02, -8.157e-03, 1.811e-02, 7.241e-03, 9.447e-03, 3.242e-03, 5.152e-02, 8.667e-02, -2.512e-02, -2.978e-02, 1.382e-01, 5.481e-02, -2.199e-02, -2.739e-02));
	r += mul(s1_2, M4(3.676e-02, 1.705e-02, -4.520e-03, -6.449e-03, 1.006e-02, 9.807e-03, -6.046e-03, -1.299e-03, -5.035e-02, -4.415e-02, 9.619e-03, -1.059e-02, -6.952e-03, -1.803e-02, -4.042e-03, -1.751e-02));
	r += mul(s1_3, M4(5.123e-02, 4.500e-02, 2.099e-01, -7.254e-03, -7.977e-02, 2.822e-02, -1.546e-01, -3.748e-02, -2.378e-01, -1.836e-02, -3.508e-02, -2.147e-03, 3.371e-02, -4.720e-02, -5.574e-02, -1.592e-02));
	r += mul(s1_4, M4(-5.764e-01, 5.998e-01, -2.288e-01, 7.223e-01, -1.855e-01, -3.467e-01, 5.173e-02, -8.967e-02, 3.308e-01, -8.987e-02, 2.397e-01, 3.701e-01, -7.970e-02, -9.046e-01, 2.397e-01, -1.626e-01));
	r += mul(s1_5, M4(1.177e-02, -1.538e-01, 4.138e-02, -5.198e-02, 3.165e-03, 3.827e-02, -5.913e-03, 8.727e-03, 7.885e-02, 2.979e-01, -6.160e-02, 1.198e-01, 1.186e-02, 9.421e-02, -4.101e-02, 4.185e-03));
	r += mul(s1_6, M4(-7.690e-02, -4.820e-03, -1.106e-01, 4.040e-02, -6.883e-02, -3.284e-02, 1.259e-02, 1.509e-01, 6.378e-03, -5.293e-04, -3.690e-02, 6.274e-02, 1.401e-01, -3.801e-03, 1.489e-01, -1.044e-02));
	r += mul(s1_7, M4(1.140e-01, -1.333e-01, -1.739e-01, -1.739e-01, 4.736e-02, -1.306e-02, -3.673e-01, -6.127e-01, -3.477e-02, -6.090e-02, 2.430e-02, -2.666e-01, -6.599e-02, 2.794e-01, -1.724e-01, -2.744e-01));
	r += mul(s1_8, M4(1.045e-02, 6.106e-02, 3.463e-02, 6.708e-02, -1.028e-02, -2.277e-02, 6.536e-02, 8.227e-02, -5.566e-02, -3.941e-02, -6.862e-03, -1.219e-02, -1.438e-02, -4.651e-02, 5.359e-02, 4.650e-02));
	r += V4(-1.731e-03, -2.098e-03, -1.131e-03, -1.644e-03);
	return tanh(r);
}

void Pass10(uint2 blockStart, uint3 tid) {
	float2 pt = float2(GetInputPt());
	uint2 gxy = (Rmp8x8(tid.x) << 1) + blockStart;
	uint2 size = GetOutputSize();
	if (gxy.x >= size.x || gxy.y >= size.y) {
		return;
	}
	float2 pos = ((gxy >> 1) + 0.5) * pt;

	V4 s0_0 = l0(-1.0, -1.0);
	V4 s0_1 = l0(0.0, -1.0);
	V4 s0_2 = l0(1.0, -1.0);
	V4 s0_3 = l0(-1.0, 0.0);
	V4 s0_4 = l0(0.0, 0.0);
	V4 s0_5 = l0(1.0, 0.0);
	V4 s0_6 = l0(-1.0, 1.0);
	V4 s0_7 = l0(0.0, 1.0);
	V4 s0_8 = l0(1.0, 1.0);
	V4 s1_0 = -max(-s0_0, 0.0);
	V4 s1_1 = -max(-s0_1, 0.0);
	V4 s1_2 = -max(-s0_2, 0.0);
	V4 s1_3 = -max(-s0_3, 0.0);
	V4 s1_4 = -max(-s0_4, 0.0);
	V4 s1_5 = -max(-s0_5, 0.0);
	V4 s1_6 = -max(-s0_6, 0.0);
	V4 s1_7 = -max(-s0_7, 0.0);
	V4 s1_8 = -max(-s0_8, 0.0);
	s0_0 = max(s0_0, 0.0);
	s0_1 = max(s0_1, 0.0);
	s0_2 = max(s0_2, 0.0);
	s0_3 = max(s0_3, 0.0);
	s0_4 = max(s0_4, 0.0);
	s0_5 = max(s0_5, 0.0);
	s0_6 = max(s0_6, 0.0);
	s0_7 = max(s0_7, 0.0);
	s0_8 = max(s0_8, 0.0);

	V4 r = f0(s0_0, s0_1, s0_2, s0_3, s0_4, s0_5, s0_6, s0_7, s0_8, s1_0, s1_1, s1_2, s1_3, s1_4, s1_5, s1_6, s1_7, s1_8);

	static const float3x3 rgb2yuv = {0.299, 0.587, 0.114, -0.169, -0.331, 0.5, 0.5, -0.419, -0.081};
	static const float3x3 yuv2rgb = {1, -0.00093, 1.401687, 1, -0.3437, -0.71417, 1, 1.77216, 0.00099};
	float2 opt = float2(GetOutputPt());

	pos -= 0.5f * opt;
	float3 yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
	OUTPUT[gxy] = float4(mul(yuv2rgb, float3(saturate(yuv.r + r.x), yuv.yz)), 1);

	++gxy.x;
	pos.x += opt.x;
	yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
	OUTPUT[gxy] = float4(mul(yuv2rgb, float3(saturate(yuv.r + r.y), yuv.yz)), 1);

	++gxy.y;
	pos.y += opt.y;
	yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
	OUTPUT[gxy] = float4(mul(yuv2rgb, float3(saturate(yuv.r + r.w), yuv.yz)), 1);

	--gxy.x;
	pos.x -= opt.x;
	yuv = mul(rgb2yuv, INPUT.SampleLevel(SL, pos, 0).rgb);
	OUTPUT[gxy] = float4(mul(yuv2rgb, float3(saturate(yuv.r + r.z), yuv.yz)), 1);
}
